In [479]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
In [480]:
%matplotlib inline
In [481]:
# Seed NumPy's global RNG once so every random draw in this notebook
# (normal, exponential, bootstrap resampling) is reproducible on
# Restart Kernel -> Run All.
SEED = 42
np.random.seed(SEED)
# 10,000 draws from a normal distribution with mean 3 and standard deviation 1.
samples = np.random.normal(loc=3, scale=1, size=10000)
In [482]:
# Square-root rule: use about sqrt(N) bins for N observations.
bins = int(np.sqrt(len(samples)))
# Histogram scaled to unit area (an estimate of the PDF).
# `density=True` replaces the `normed=True` keyword, which newer Matplotlib
# releases no longer accept.
_ = plt.hist(samples, bins=bins, density=True, cumulative=False)
In [483]:
# Square-root rule: use about sqrt(N) bins for N observations.
bins = int(np.sqrt(len(samples)))
# Cumulative, normalised histogram (an estimate of the CDF; last bin reaches 1).
# `density=True` replaces the `normed=True` keyword, which newer Matplotlib
# releases no longer accept.
_ = plt.hist(samples, bins=bins, density=True, cumulative=True)
In [484]:
# Download the MLB no-hitters CSV; the first row of the file supplies the
# column names.
df = pd.read_csv(
    'https://assets.datacamp.com/production/course_1550/datasets/mlb_nohitters.csv',
    header=0,
    parse_dates=True,
)
# Summarise column dtypes and non-null counts of the freshly loaded frame.
df.info()
In [485]:
# Preview the first five rows as loaded, before any dtype conversion.
df.head(n=5)
Out[485]:
In [486]:
# Parse the date column, which is read using the YYYYMMDD pattern.
df['date'] = pd.to_datetime(df['date'], format='%Y%m%d')
# Store both team columns as pandas categoricals.
for team_col in ('winning_team', 'losing_team'):
    df[team_col] = df[team_col].astype('category')
# Re-check the dtypes after conversion.
df.info()
In [487]:
# Preview the first five rows after the dtype conversion.
df.head(n=5)
Out[487]:
In [488]:
# Add `days_between`: whole days elapsed since the previous row's game.
#
# Series.diff() subtracts the preceding row's date directly, which replaces
# the earlier shift-the-index-by-one self-merge. The first row has no
# predecessor, so its gap is filled with 0 days. Rows are used in the order
# they appear in the frame (assumed chronological - TODO confirm).
#
# The guard makes the cell safe to re-run: once `date` has been moved into
# the index there is nothing left to compute.
if 'date' in df.columns:
    df = df.assign(
        days_between=df['date'].diff().fillna(pd.Timedelta('0 days')).dt.days
    ).set_index('date')
# Keep the previous variable name bound to the same (overwritten) frame.
d1_m = df
d1_m.head()
Out[488]:
In [489]:
# Empirical CDF of the observed gaps: sorted values on the x axis and the
# cumulative fraction i/n (i = 1..n) on the y axis.
x_ = np.sort(df['days_between'])
n_ = len(x_)
y_ = np.arange(1, n_ + 1) / n_
In [490]:
# Which teams threw the most no-hitters?
# Count the non-null `game_number` entries per winning team, order the
# teams from most to fewest, and give the columns display-friendly names.
data = (
    df.groupby('winning_team')[['game_number']]
    .count()
    .sort_values(by='game_number', ascending=False)
    .reset_index()
    .rename(columns={'game_number': 'no hitters', 'winning_team': 'winning team'})
)
data.plot(kind='bar', x='winning team', y='no hitters')
plt.title('Teams with most no hitters')
plt.tight_layout()
In [491]:
# Which teams were no-hit most often?
# Count the non-null `game_number` entries per losing team, order the teams
# from most to fewest, and give the columns display-friendly names.
data = (
    df.groupby('losing_team')[['game_number']]
    .count()
    .sort_values(by='game_number', ascending=False)
    .reset_index()
    .rename(columns={'game_number': 'no hitters', 'losing_team': 'losing team'})
)
data.plot(kind='bar', x='losing team', y='no hitters')
# Title corrected: the chart counts no-hitter losses, and "loses" was a typo.
plt.title('Teams with most no-hitter losses')
plt.tight_layout()
In [492]:
# Mean and standard deviation of the observed gaps (pandas' default `std`),
# used below as the parameters of a candidate normal model.
mu, std = df['days_between'].agg(['mean', 'std'])
In [493]:
# Simulate a normal distribution with the same mean and standard deviation
# as the observed gaps, then overlay the two normalised histograms.
normal_samples = np.random.normal(mu, std, size=10000)
# `density=True` replaces the `normed=True` keyword, which newer Matplotlib
# releases no longer accept.
_ = plt.hist(df['days_between'], bins=50, density=True, alpha=.3, label='Real')
_ = plt.hist(normal_samples, bins=50, density=True, histtype='step', color='black', label='Theoretical')
plt.legend()
plt.margins(0.02)
plt.show()
In [494]:
# ECDF of the simulated normal draws: sorted values against i/n.
xt_ = np.sort(normal_samples)
nt_ = len(xt_)
yt_ = np.arange(1, nt_ + 1) / nt_
# Observed ECDF as points, simulated ECDF as a semi-transparent line.
_ = plt.plot(x_, y_, linestyle='none', marker='.', label='Real')
_ = plt.plot(xt_, yt_, label='Theoretical', alpha=.5)
plt.legend(loc='lower right')
plt.margins(0.02)
plt.show()
In [495]:
# The normal model is an awful fit for the observed days between no-hitters.
In [496]:
# Average number of days between consecutive no-hitters.
df['days_between'].mean()
Out[496]:
In [497]:
# Simulate an exponential distribution whose scale (mean) equals the observed
# mean gap, then overlay the two normalised histograms.
tau = df['days_between'].mean()
exp_samples = np.random.exponential(tau, size=10000)
# `density=True` replaces the `normed=True` keyword, which newer Matplotlib
# releases no longer accept.
_ = plt.hist(df['days_between'], bins=50, density=True, alpha=.3, label='Real')
_ = plt.hist(exp_samples, bins=50, density=True, histtype='step', color='black', label='Theoretical')
plt.legend()
plt.margins(0.02)
In [498]:
# ECDF of the simulated exponential draws: sorted values against i/n.
xt_ = np.sort(exp_samples)
nt_ = len(xt_)
yt_ = np.arange(1, nt_ + 1) / nt_
# Observed ECDF as points, simulated ECDF as a semi-transparent line.
_ = plt.plot(x_, y_, linestyle='none', marker='.', label='Real')
_ = plt.plot(xt_, yt_, label='Theoretical', alpha=.5)
plt.legend(loc='lower right')
plt.margins(0.02)
plt.show()
In [499]:
# The exponential model makes much more sense for the days between no-hitters.
In [500]:
# Analytic standard error of the mean: standard deviation of the gaps
# divided by the square root of the number of observations.
n_obs = len(df['days_between'])
sem = np.std(df['days_between']) / np.sqrt(n_obs)
print('population')
print('sem:', sem)
print('mu:', np.mean(df['days_between']))
In [501]:
# Bootstrap the mean gap: draw 10,000 resamples (with replacement, each the
# same size as the data) and record the mean of every resample.
# NOTE: this rebinds `samples`, which earlier held the normal draws.
size = len(df['days_between'])
# Convert to a NumPy array once instead of on every iteration.
gaps = df['days_between'].to_numpy()
samples = [np.random.choice(gaps, size=size).mean() for _ in range(10000)]
# Centre and spread of the bootstrap distribution of the mean; the spread
# is the bootstrap estimate of the standard error.
s_mu = np.mean(samples)
s_std = np.std(samples)
print('sampling')
print('std:', s_std)
print('mean:', s_mu)
In [502]:
# 2.5th and 97.5th percentiles of `samples` (bounds of the central 95%).
np.percentile(samples, q=[2.5, 97.5])
Out[502]:
In [503]:
# Square-root rule: use about sqrt(N) bins for the N values in `samples`.
bins = int(np.sqrt(len(samples)))
# Normalised histogram of `samples`.
# `density=True` replaces the `normed=True` keyword, which newer Matplotlib
# releases no longer accept.
_ = plt.hist(samples, density=True, bins=bins)